In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os
import glob
import numpy as np
import datetime as dt
np.random.seed(42)

Each month is stored in a csv file so we will merge them all together.

  1. Import packages and set the working directory.
  2. Use glob to match the pattern 'csv' ...
  3. Combine all files in the list and export as CSV.
In [2]:
extension="csv"
all_files=[i for i in glob.glob("Data/*.{}".format(extension))]

df=pd.concat([pd.read_csv(file) for file in all_files])
df.to_csv("combined_tripdata.csv",index=False)
print(df.shape)
df.head()
/Users/abdullahabu-sabba/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3254: DtypeWarning: Columns (13,14) have mixed types.Specify dtype option on import or set low_memory=False.
  if (await self.run_code(code, result,  async_=asy)):
/Users/abdullahabu-sabba/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3254: DtypeWarning: Columns (14) have mixed types.Specify dtype option on import or set low_memory=False.
  if (await self.run_code(code, result,  async_=asy)):
/Users/abdullahabu-sabba/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3254: DtypeWarning: Columns (13) have mixed types.Specify dtype option on import or set low_memory=False.
  if (await self.run_code(code, result,  async_=asy)):
(2506983, 15)
Out[2]:
duration_sec start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip rental_access_method
0 60863 2019-09-30 11:48:02.7100 2019-10-01 04:42:25.8640 465.0 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 465.0 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 12904 Customer No NaN
1 36019 2019-09-30 16:16:32.3530 2019-10-01 02:16:51.9820 294.0 Pierce Ave at Market St 37.327581 -121.884559 443.0 3rd St at Keyes St 37.320866 -121.876279 163 Customer No NaN
2 5615 2019-09-30 23:12:25.9980 2019-10-01 00:46:01.9590 370.0 Jones St at Post St 37.787327 -122.413278 4.0 Cyril Magnin St at Ellis St 37.785881 -122.408915 10737 Customer No NaN
3 1482 2019-09-30 23:57:34.6630 2019-10-01 00:22:16.8490 109.0 17th St at Valencia St 37.763316 -122.421904 460.0 Terry Francois Blvd at Warriors Way 37.769095 -122.386333 10346 Customer No NaN
4 1272 2019-09-30 23:53:28.6530 2019-10-01 00:14:41.0740 95.0 Sanchez St at 15th St 37.766219 -122.431060 127.0 Valencia St at 21st St 37.756708 -122.421025 1349 Subscriber No NaN
In [3]:
df=pd.read_csv("combined_tripdata.csv")
df.head()
/Users/abdullahabu-sabba/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (13,14) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
duration_sec start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip rental_access_method
0 60863 2019-09-30 11:48:02.7100 2019-10-01 04:42:25.8640 465.0 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 465.0 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 12904 Customer No NaN
1 36019 2019-09-30 16:16:32.3530 2019-10-01 02:16:51.9820 294.0 Pierce Ave at Market St 37.327581 -121.884559 443.0 3rd St at Keyes St 37.320866 -121.876279 163 Customer No NaN
2 5615 2019-09-30 23:12:25.9980 2019-10-01 00:46:01.9590 370.0 Jones St at Post St 37.787327 -122.413278 4.0 Cyril Magnin St at Ellis St 37.785881 -122.408915 10737 Customer No NaN
3 1482 2019-09-30 23:57:34.6630 2019-10-01 00:22:16.8490 109.0 17th St at Valencia St 37.763316 -122.421904 460.0 Terry Francois Blvd at Warriors Way 37.769095 -122.386333 10346 Customer No NaN
4 1272 2019-09-30 23:53:28.6530 2019-10-01 00:14:41.0740 95.0 Sanchez St at 15th St 37.766219 -122.431060 127.0 Valencia St at 21st St 37.756708 -122.421025 1349 Subscriber No NaN
In [4]:
df_copy=df.copy()

The rental access_method column will be dropped because it's only available in the following csv files:

  • Data/201906-baywheels-tripdata.csv
  • Data/201907-baywheels-tripdata.csv
  • Data/201911-baywheels-tripdata.csv
  • Data/201912-baywheels-tripdata.csv
In [5]:
df.drop(labels="rental_access_method",axis=1,inplace=True)

Data exploring and cleaning.

In [6]:
from pandas_profiling import ProfileReport
profile=ProfileReport(df,title="combined_tripdata.csv",explorative=True)
profile.to_widgets()


In [7]:
profile.to_widgets()
In [8]:
profile.to_notebook_iframe()

In [9]:
profile.to_file("report.html")

In [10]:
df.columns
Out[10]:
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
       'start_station_name', 'start_station_latitude',
       'start_station_longitude', 'end_station_id', 'end_station_name',
       'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
       'bike_share_for_all_trip'],
      dtype='object')
In [11]:
df.shape
Out[11]:
(2506983, 14)
In [12]:
df.info(verbose=True,null_counts=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2506983 entries, 0 to 2506982
Data columns (total 14 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   duration_sec             2506983 non-null  int64  
 1   start_time               2506983 non-null  object 
 2   end_time                 2506983 non-null  object 
 3   start_station_id         2426249 non-null  float64
 4   start_station_name       2426850 non-null  object 
 5   start_station_latitude   2506983 non-null  float64
 6   start_station_longitude  2506983 non-null  float64
 7   end_station_id           2424081 non-null  float64
 8   end_station_name         2424633 non-null  object 
 9   end_station_latitude     2506983 non-null  float64
 10  end_station_longitude    2506983 non-null  float64
 11  bike_id                  2506983 non-null  int64  
 12  user_type                2506983 non-null  object 
 13  bike_share_for_all_trip  2263724 non-null  object 
dtypes: float64(6), int64(2), object(6)
memory usage: 267.8+ MB
In [13]:
pd.set_option('float_format', '{:f}'.format)
df.describe()
Out[13]:
duration_sec start_station_id start_station_latitude start_station_longitude end_station_id end_station_latitude end_station_longitude bike_id
count 2506983.000000 2426249.000000 2506983.000000 2506983.000000 2424081.000000 2506983.000000 2506983.000000 2506983.000000
mean 807.648330 146.504749 37.765064 -122.349919 142.704424 37.764219 -122.345908 27898.327162
std 1974.713981 122.317102 0.136370 0.308965 121.429649 0.239289 0.708042 114606.651187
min 60.000000 3.000000 0.000000 -122.514299 3.000000 0.000000 -122.514287 4.000000
25% 359.000000 47.000000 37.769305 -122.413004 43.000000 37.770030 -122.411726 1952.000000
50% 571.000000 105.000000 37.780526 -122.398285 101.000000 37.780760 -122.398113 4420.000000
75% 887.000000 243.000000 37.795393 -122.291415 239.000000 37.795873 -122.293400 9682.000000
max 912110.000000 498.000000 45.510000 0.000000 498.000000 45.510000 0.000000 999941.000000

Percentage of null for each columns.

In [14]:
for index,name in enumerate(df.columns):
    null_count=df[name].isnull().sum();
    Percentage=(null_count/df.shape[0])*100
    print("Percentage of null: {0:.2f}% for column:{1}".format(Percentage,name))
Percentage of null: 0.00% for column:duration_sec
Percentage of null: 0.00% for column:start_time
Percentage of null: 0.00% for column:end_time
Percentage of null: 3.22% for column:start_station_id
Percentage of null: 3.20% for column:start_station_name
Percentage of null: 0.00% for column:start_station_latitude
Percentage of null: 0.00% for column:start_station_longitude
Percentage of null: 3.31% for column:end_station_id
Percentage of null: 3.28% for column:end_station_name
Percentage of null: 0.00% for column:end_station_latitude
Percentage of null: 0.00% for column:end_station_longitude
Percentage of null: 0.00% for column:bike_id
Percentage of null: 0.00% for column:user_type
Percentage of null: 9.70% for column:bike_share_for_all_trip
In [15]:
df.isnull().sum()
Out[15]:
duration_sec                    0
start_time                      0
end_time                        0
start_station_id            80734
start_station_name          80133
start_station_latitude          0
start_station_longitude         0
end_station_id              82902
end_station_name            82350
end_station_latitude            0
end_station_longitude           0
bike_id                         0
user_type                       0
bike_share_for_all_trip    243259
dtype: int64

Define: drop null.

since the null value dosent represent larg percent on the data set, dropping them will not affect the analyzes results

In [16]:
# code.
df.dropna(inplace=True)
#test
df.isnull().sum()
Out[16]:
duration_sec               0
start_time                 0
end_time                   0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_station_id             0
end_station_name           0
end_station_latitude       0
end_station_longitude      0
bike_id                    0
user_type                  0
bike_share_for_all_trip    0
dtype: int64
In [17]:
print(df.shape)
print(2506983-2262979)
(2262979, 14)
244004

Define:

change start and ending time to datetime object.

In [18]:
#code
df["start_time"]=pd.to_datetime(df["start_time"])
df["end_time"]=pd.to_datetime(df["end_time"])
In [19]:
#test
df.info(verbose=True,null_counts=True)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2262979 entries, 0 to 2506982
Data columns (total 14 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   duration_sec             2262979 non-null  int64         
 1   start_time               2262979 non-null  datetime64[ns]
 2   end_time                 2262979 non-null  datetime64[ns]
 3   start_station_id         2262979 non-null  float64       
 4   start_station_name       2262979 non-null  object        
 5   start_station_latitude   2262979 non-null  float64       
 6   start_station_longitude  2262979 non-null  float64       
 7   end_station_id           2262979 non-null  float64       
 8   end_station_name         2262979 non-null  object        
 9   end_station_latitude     2262979 non-null  float64       
 10  end_station_longitude    2262979 non-null  float64       
 11  bike_id                  2262979 non-null  int64         
 12  user_type                2262979 non-null  object        
 13  bike_share_for_all_trip  2262979 non-null  object        
dtypes: datetime64[ns](2), float64(6), int64(2), object(4)
memory usage: 259.0+ MB
In [20]:
df.bike_share_for_all_trip.value_counts()
Out[20]:
No     2085442
Yes     177537
Name: bike_share_for_all_trip, dtype: int64

Define:

change values yes,no to boolean true and false.

In [21]:
#code 
df.replace(to_replace="No",value=False,inplace=True)
df.replace(to_replace="Yes",value=True,inplace=True)

#test
df.bike_share_for_all_trip.value_counts()
Out[21]:
False    2085442
True      177537
Name: bike_share_for_all_trip, dtype: int64
In [22]:
df.duration_sec.value_counts()[:10]
Out[22]:
339    2996
372    2990
324    2948
355    2944
370    2943
353    2935
381    2934
383    2926
313    2923
386    2922
Name: duration_sec, dtype: int64
In [23]:
df.duration_sec[:10]
Out[23]:
0    60863
1    36019
2     5615
3     1482
4     1272
5     8648
6      644
7      459
8      625
9     1177
Name: duration_sec, dtype: int64

Define:

setting duration to timedelta64[ns] dtype format as hh:mm:ss and changing column name to 'duration'

In [24]:
# setting duration to minutes and change columns name
#code
df.duration_sec=df.duration_sec/60
df.rename(columns={"duration_sec":"duration_min"},inplace=True)
#test
In [25]:
df.duplicated().sum()
Out[25]:
0
In [26]:
df.user_type.value_counts()
Out[26]:
Subscriber    1860795
Customer       402184
Name: user_type, dtype: int64
In [27]:
df.user_type.value_counts()
Out[27]:
Subscriber    1860795
Customer       402184
Name: user_type, dtype: int64
In [28]:
df.start_station_name.value_counts()[:10]
Out[28]:
Market St at 10th St                                         42089
Berry St at 4th St                                           39803
San Francisco Ferry Building (Harry Bridges Plaza)           36928
San Francisco Caltrain (Townsend St at 4th St)               36843
San Francisco Caltrain Station 2  (Townsend St at 4th St)    35799
Montgomery St BART Station (Market St at 2nd St)             33960
Powell St BART Station (Market St at 4th St)                 32260
Steuart St at Market St                                      30305
Howard St at Beale St                                        28948
Powell St BART Station (Market St at 5th St)                 28873
Name: start_station_name, dtype: int64
In [29]:
df.end_station_name.value_counts()[:10]
Out[29]:
San Francisco Caltrain Station 2  (Townsend St at 4th St)    51343
San Francisco Caltrain (Townsend St at 4th St)               45747
San Francisco Ferry Building (Harry Bridges Plaza)           45458
Montgomery St BART Station (Market St at 2nd St)             44676
Market St at 10th St                                         41412
Berry St at 4th St                                           38988
Powell St BART Station (Market St at 4th St)                 34813
The Embarcadero at Sansome St                                31655
Steuart St at Market St                                      31097
Powell St BART Station (Market St at 5th St)                 29706
Name: end_station_name, dtype: int64
In [30]:
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """

    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians)

    All (lat, lon) coordinates must have numeric dtypes and be of equal length.

    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    a = np.sin((lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

Haversine formula

Haversine Formula – Calculate geographic distance on earth. If you have two different latitude – longitude values of two different point on earth, then with the help of Haversine Formula, you can easily compute the great-circle distance (The shortest distance between two points on the surface of a Sphere). The term Haversine was coined by Prof. James Inman in 1835. Haversine is very popular and frequently used formula when developing a GIS (Geographic Information System) application or analyzing path and fields.

sources.

wikipedia page Haversine

stackoverflow Question-1

stackoverflow Question-2

I want to have the distance between the start and end location so the below cordinate is of a location that I know to test the formula

In [31]:
lat1=37.775745
lon1=-122.213037
lat2=37.808894
lon2=-122.25646
haversine(lat1,lon1,lat2,lon2)
Out[31]:
5.305220342028568
In [32]:
from IPython.display import Image
Image (filename="img.png")
Out[32]:

looks great, this is the first start and ending point of our data set before cleaning and now we can apply the function to it.

In [33]:
df[["start_station_latitude",
        "start_station_longitude",
        "end_station_latitude",
        "end_station_longitude"]].head(1)
Out[33]:
start_station_latitude start_station_longitude end_station_latitude end_station_longitude
0 37.776329 -122.394438 37.776329 -122.394438

Define:

add distance column and apply the haversine function to cordinantes column.

In [34]:
# code: applying the function to the dataframe
df["distance_km"]=df.apply(lambda x: haversine(x["start_station_latitude"],
                                                        x["start_station_longitude"],
                                                       x["end_station_latitude"],
                                                       x["end_station_longitude"]),axis=1)
In [35]:
#test
df.sort_values(by="distance_km",ascending=False).head()
Out[35]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km
2147137 1100.616667 2019-11-25 16:40:51.382 2019-11-26 11:01:28.665 485.000000 Funston Ave at Fulton St 37.773263 -122.471230 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12577 Customer False 12799.727782
233272 24.850000 2019-04-28 18:40:41.288 2019-04-28 19:05:33.063 377.000000 Fell St at Stanyan St 37.771917 -122.453704 420.000000 SF Test Station 0.000000 0.000000 150 Customer False 12798.346860
233473 25.050000 2019-04-28 17:52:22.210 2019-04-28 18:17:25.545 377.000000 Fell St at Stanyan St 37.771917 -122.453704 420.000000 SF Test Station 0.000000 0.000000 901 Subscriber False 12798.346860
233472 25.016667 2019-04-28 17:52:27.345 2019-04-28 18:17:28.410 377.000000 Fell St at Stanyan St 37.771917 -122.453704 420.000000 SF Test Station 0.000000 0.000000 3632 Subscriber False 12798.346860
233441 15.066667 2019-04-28 18:08:13.576 2019-04-28 18:23:18.367 412.000000 Glen Park BART Station 37.732813 -122.434140 420.000000 SF Test Station 0.000000 0.000000 2760 Customer False 12798.321047
In [36]:
df.sort_values(by="distance_km",ascending=True).head()
Out[36]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km
0 1014.383333 2019-09-30 11:48:02.710 2019-10-01 04:42:25.864 465.000000 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 465.000000 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 12904 Customer False 0.000000
1481196 26.150000 2019-05-13 08:06:54.983 2019-05-13 08:33:04.484 15.000000 San Francisco Ferry Building (Harry Bridges Pl... 37.795392 -122.394203 15.000000 San Francisco Ferry Building (Harry Bridges Pl... 37.795392 -122.394203 572 Customer False 0.000000
1481477 8.300000 2019-05-13 08:09:01.332 2019-05-13 08:17:20.036 235.000000 Union St at 10th St 37.807239 -122.289370 235.000000 Union St at 10th St 37.807239 -122.289370 417 Subscriber False 0.000000
1481489 48.516667 2019-05-13 07:28:13.333 2019-05-13 08:16:44.748 245.000000 Downtown Berkeley BART 37.870139 -122.268422 245.000000 Downtown Berkeley BART 37.870139 -122.268422 593 Subscriber False 0.000000
1481515 7.383333 2019-05-13 08:07:02.256 2019-05-13 08:14:25.820 308.000000 San Pedro Square 37.336802 -121.894090 308.000000 San Pedro Square 37.336802 -121.894090 3985 Subscriber False 0.000000
In [37]:
print("99.9%:",np.percentile(df.distance_km,99.9))
print("100%:",np.percentile(df.distance_km,100))
99.9%: 6.953565105662369
100%: 12799.727782102611
In [38]:
print(df[df["distance_km"]>10_000].shape)
df[df["distance_km"]>10_000]
(49, 15)
Out[38]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km
219163 39.416667 2019-04-30 18:21:42.610 2019-04-30 19:01:08.146 50.000000 2nd St at Townsend St 37.780526 -122.390288 420.000000 SF Test Station 0.000000 0.000000 1612 Subscriber False 12792.805424
219929 32.733333 2019-04-30 17:31:19.646 2019-04-30 18:04:04.518 98.000000 Valencia St at 16th St 37.765052 -122.421866 420.000000 SF Test Station 0.000000 0.000000 3507 Subscriber False 12796.015813
220083 9.100000 2019-04-30 17:45:38.452 2019-04-30 17:54:45.258 370.000000 Jones St at Post St 37.787327 -122.413278 420.000000 SF Test Station 0.000000 0.000000 1142 Subscriber True 12794.414400
220162 10.500000 2019-04-30 17:39:33.507 2019-04-30 17:50:04.314 21.000000 Montgomery St BART Station (Market St at 2nd St) 37.789625 -122.400811 420.000000 SF Test Station 0.000000 0.000000 3500 Subscriber False 12793.300708
220615 7.216667 2019-04-30 17:17:10.486 2019-04-30 17:24:23.919 11.000000 Davis St at Jackson St 37.797280 -122.398436 420.000000 SF Test Station 0.000000 0.000000 1181 Subscriber False 12792.797705
220698 20.550000 2019-04-30 16:58:41.209 2019-04-30 17:19:14.689 81.000000 Berry St at 4th St 37.775880 -122.393170 420.000000 SF Test Station 0.000000 0.000000 2535 Subscriber False 12793.228655
220730 21.766667 2019-04-30 16:55:35.924 2019-04-30 17:17:22.162 80.000000 Townsend St at 5th St 37.775235 -122.397437 420.000000 SF Test Station 0.000000 0.000000 462 Subscriber False 12793.604203
221021 12.733333 2019-04-30 16:44:12.351 2019-04-30 16:56:56.539 16.000000 Steuart St at Market St 37.794130 -122.394430 420.000000 SF Test Station 0.000000 0.000000 1880 Customer False 12792.596591
233272 24.850000 2019-04-28 18:40:41.288 2019-04-28 19:05:33.063 377.000000 Fell St at Stanyan St 37.771917 -122.453704 420.000000 SF Test Station 0.000000 0.000000 150 Customer False 12798.346860
233325 11.333333 2019-04-28 18:40:08.324 2019-04-28 18:51:28.357 139.000000 Garfield Square (25th St at Harrison St) 37.751017 -122.411901 420.000000 SF Test Station 0.000000 0.000000 311 Subscriber False 12795.764903
233440 12.733333 2019-04-28 18:10:38.170 2019-04-28 18:23:22.476 412.000000 Glen Park BART Station 37.732813 -122.434140 420.000000 SF Test Station 0.000000 0.000000 333 Customer False 12798.321047
233441 15.066667 2019-04-28 18:08:13.576 2019-04-28 18:23:18.367 412.000000 Glen Park BART Station 37.732813 -122.434140 420.000000 SF Test Station 0.000000 0.000000 2760 Customer False 12798.321047
233472 25.016667 2019-04-28 17:52:27.345 2019-04-28 18:17:28.410 377.000000 Fell St at Stanyan St 37.771917 -122.453704 420.000000 SF Test Station 0.000000 0.000000 3632 Subscriber False 12798.346860
233473 25.050000 2019-04-28 17:52:22.210 2019-04-28 18:17:25.545 377.000000 Fell St at Stanyan St 37.771917 -122.453704 420.000000 SF Test Station 0.000000 0.000000 901 Subscriber False 12798.346860
233492 14.000000 2019-04-28 18:00:23.567 2019-04-28 18:14:23.599 88.000000 11th St at Bryant St 37.770030 -122.411726 420.000000 SF Test Station 0.000000 0.000000 536 Subscriber False 12794.984465
258666 2.916667 2019-04-24 16:56:54.516 2019-04-24 16:59:50.429 420.000000 SF Test Station 0.000000 0.000000 81.000000 Berry St at 4th St 37.775880 -122.393170 1161 Subscriber False 12793.228655
258708 3.250000 2019-04-24 16:53:35.591 2019-04-24 16:56:51.006 104.000000 4th St at 16th St 37.767045 -122.390833 420.000000 SF Test Station 0.000000 0.000000 3498 Subscriber False 12793.393084
267605 15.350000 2019-04-23 13:14:09.492 2019-04-23 13:29:31.333 20.000000 Mechanics Monument Plaza (Market St at Bush St) 37.791300 -122.399051 420.000000 SF Test Station 0.000000 0.000000 1768 Subscriber False 12793.089079
305806 15.200000 2019-04-17 09:30:11.764 2019-04-17 09:45:24.263 399.000000 Bay St at Fillmore St 37.802636 -122.436289 420.000000 SF Test Station 0.000000 0.000000 1939 Subscriber False 12795.681002
1764863 421.600000 2019-10-29 16:38:20.379 2019-10-29 23:39:56.489 204.000000 55th St at Telegraph Ave 37.840186 -122.261822 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12137 Subscriber False 12779.884538
1774512 379.283333 2019-10-28 13:05:14.849 2019-10-28 19:24:32.640 20.000000 Mechanics Monument Plaza (Market St at Bush St) 37.791300 -122.399051 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 13051 Customer False 12793.089079
1776078 850.316667 2019-10-28 03:30:42.511 2019-10-28 17:41:02.050 96.000000 Dolores St at 15th St 37.766210 -122.426614 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 11491 Customer False 12796.358042
1778567 1005.333333 2019-10-27 18:10:23.938 2019-10-28 10:55:44.254 58.000000 Market St at 10th St 37.776619 -122.417385 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 1846 Customer False 12795.182445
1832977 1425.750000 2019-10-20 11:30:59.108 2019-10-21 11:16:44.516 147.000000 29th St at Tiffany Ave 37.744067 -122.421472 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 11476 Subscriber False 12796.829293
1837216 1099.750000 2019-10-20 00:07:21.632 2019-10-20 18:27:06.744 47.000000 4th St at Harrison St 37.780955 -122.399749 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12635 Customer False 12793.563150
1858443 20.850000 2019-10-17 11:55:04.979 2019-10-17 12:15:56.114 436.000000 McKinnon Ave at 3rd St 37.736296 -122.389970 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 10865 Subscriber False 12794.560177
1888016 515.583333 2019-10-13 11:21:50.464 2019-10-13 19:57:25.842 66.000000 3rd St at Townsend St 37.778742 -122.392741 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12174 Customer False 12793.078228
1888020 282.150000 2019-10-13 15:11:20.084 2019-10-13 19:53:29.115 121.000000 Mission Playground 37.759210 -122.421339 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 11622 Customer False 12796.208113
1920703 2.933333 2019-10-09 12:21:15.400 2019-10-09 12:24:12.203 104.000000 4th St at 16th St 37.767045 -122.390833 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12320 Subscriber False 12793.393084
1936237 794.300000 2019-10-07 04:30:05.730 2019-10-07 17:44:24.686 239.000000 Bancroft Way at Telegraph Ave 37.868813 -122.258764 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 1121 Subscriber True 12778.484403
2140813 754.133333 2019-11-28 09:09:53.935 2019-11-28 21:44:02.823 493.000000 Folsom St at 7th St 37.776664 -122.407908 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 10365 Customer False 12794.404402
2145595 516.050000 2019-11-26 11:59:15.370 2019-11-26 20:35:19.099 96.000000 Dolores St at 15th St 37.766210 -122.426614 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 10781 Subscriber False 12796.358042
2146155 156.283333 2019-11-26 14:19:22.678 2019-11-26 16:55:39.976 356.000000 Valencia St at Clinton Park 37.769188 -122.422285 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 11027 Customer False 12795.883395
2146537 22.033333 2019-11-26 13:30:04.651 2019-11-26 13:52:07.140 104.000000 4th St at 16th St 37.767045 -122.390833 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 10828 Subscriber False 12793.393084
2147137 1100.616667 2019-11-25 16:40:51.382 2019-11-26 11:01:28.665 485.000000 Funston Ave at Fulton St 37.773263 -122.471230 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12577 Customer False 12799.727782
2156374 620.450000 2019-11-24 20:06:49.737 2019-11-25 06:27:17.125 364.000000 China Basin St at 3rd St 37.772000 -122.389970 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 2691 Customer False 12793.122773
2165733 40.216667 2019-11-22 17:20:33.131 2019-11-22 18:00:46.193 363.000000 Salesforce Transit Center (Natoma St at 2nd St) 37.787492 -122.398285 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12323 Customer False 12793.179770
2176684 743.616667 2019-11-20 22:46:19.029 2019-11-21 11:09:56.400 17.000000 Embarcadero BART Station (Beale St at Market St) 37.792251 -122.397086 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 9543 Customer False 12792.889826
2180790 520.950000 2019-11-20 11:34:02.545 2019-11-20 20:15:00.509 21.000000 Montgomery St BART Station (Market St at 2nd St) 37.789625 -122.400811 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 9893 Customer False 12793.300708
2197186 611.116667 2019-11-18 11:07:36.196 2019-11-18 21:18:43.927 66.000000 3rd St at Townsend St 37.778742 -122.392741 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 10289 Customer False 12793.078228
2197190 290.183333 2019-11-18 16:27:36.109 2019-11-18 21:17:47.368 10.000000 Washington St at Kearny St 37.795393 -122.404770 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 10798 Customer False 12793.392476
2219024 15.200000 2019-11-15 12:10:35.861 2019-11-15 12:25:48.696 410.000000 Illinois St at Cesar Chavez St 37.750200 -122.386567 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 12746 Subscriber False 12793.721678
2235413 55.633333 2019-11-13 11:13:15.373 2019-11-13 12:08:53.728 93.000000 4th St at Mission Bay Blvd S 37.770407 -122.391198 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 9686 Subscriber False 12793.287553
2248906 133.700000 2019-11-11 16:58:24.252 2019-11-11 19:12:06.404 3.000000 Powell St BART Station (Market St at 4th St) 37.786375 -122.404904 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 9785 Customer False 12793.766945
2252541 1053.300000 2019-11-10 17:11:54.330 2019-11-11 10:45:13.151 81.000000 Berry St at 4th St 37.775880 -122.393170 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 10471 Customer False 12793.228655
2275899 1320.166667 2019-11-06 12:59:38.721 2019-11-07 10:59:49.654 419.000000 Fern St at Polk St 37.788197 -122.420373 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 9785 Subscriber False 12794.960311
2281079 1307.850000 2019-11-05 20:22:41.389 2019-11-06 18:10:33.253 102.000000 Irwin St at 8th St 37.766883 -122.399579 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 11865 Customer False 12794.116166
2284467 1301.800000 2019-11-05 12:23:34.230 2019-11-06 10:05:22.397 54.000000 Alamo Square (Steiner St at Fulton St) 37.777547 -122.433274 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 2339 Customer False 12796.446436
2292786 938.000000 2019-11-04 19:25:54.983 2019-11-05 11:03:55.039 12.000000 Pier 1/2 at The Embarcadero 37.796389 -122.394586 449.000000 16th Depot Bike Fleet Station 0.000000 0.000000 13022 Customer False 12792.518277

Note:

we can see here a misconception or typing mistakes when computing the distance, some end point lat and long are zeros which made their distance zero but it also maybe a round trip.Also we noticed some outliers 99.9% of our data are below 7km

In [39]:
print(df[df.distance_km==0].shape)
df[df.distance_km==0].head()
#this is the same starting and ending points
(55384, 15)
Out[39]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km
0 1014.383333 2019-09-30 11:48:02.710 2019-10-01 04:42:25.864 465.000000 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 465.000000 San Francisco Caltrain Station (King St at 4th... 37.776329 -122.394438 12904 Customer False 0.000000
10 5.983333 2019-09-30 23:45:14.434 2019-09-30 23:51:13.455 273.000000 Fulton St at Ashby Ave 37.855574 -122.263565 273.000000 Fulton St at Ashby Ave 37.855574 -122.263565 2750 Customer False 0.000000
26 6.833333 2019-09-30 23:22:01.113 2019-09-30 23:28:51.188 3.000000 Powell St BART Station (Market St at 4th St) 37.786375 -122.404904 3.000000 Powell St BART Station (Market St at 4th St) 37.786375 -122.404904 3647 Customer False 0.000000
48 5.783333 2019-09-30 23:02:15.039 2019-09-30 23:08:03.020 182.000000 19th Street BART Station 37.809013 -122.268247 182.000000 19th Street BART Station 37.809013 -122.268247 1715 Customer False 0.000000
49 12.450000 2019-09-30 22:53:01.559 2019-09-30 23:05:28.631 371.000000 Lombard St at Columbus Ave 37.802746 -122.413579 371.000000 Lombard St at Columbus Ave 37.802746 -122.413579 10311 Subscriber False 0.000000
In [40]:
print(df[df.end_station_latitude==0].shape)
df[df.end_station_latitude==0].head()
(48, 15)
Out[40]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km
219163 39.416667 2019-04-30 18:21:42.610 2019-04-30 19:01:08.146 50.000000 2nd St at Townsend St 37.780526 -122.390288 420.000000 SF Test Station 0.000000 0.000000 1612 Subscriber False 12792.805424
219929 32.733333 2019-04-30 17:31:19.646 2019-04-30 18:04:04.518 98.000000 Valencia St at 16th St 37.765052 -122.421866 420.000000 SF Test Station 0.000000 0.000000 3507 Subscriber False 12796.015813
220083 9.100000 2019-04-30 17:45:38.452 2019-04-30 17:54:45.258 370.000000 Jones St at Post St 37.787327 -122.413278 420.000000 SF Test Station 0.000000 0.000000 1142 Subscriber True 12794.414400
220162 10.500000 2019-04-30 17:39:33.507 2019-04-30 17:50:04.314 21.000000 Montgomery St BART Station (Market St at 2nd St) 37.789625 -122.400811 420.000000 SF Test Station 0.000000 0.000000 3500 Subscriber False 12793.300708
220615 7.216667 2019-04-30 17:17:10.486 2019-04-30 17:24:23.919 11.000000 Davis St at Jackson St 37.797280 -122.398436 420.000000 SF Test Station 0.000000 0.000000 1181 Subscriber False 12792.797705
In [41]:
print(df[df.start_station_latitude==0].shape)
df[df.start_station_latitude==0].head()
(1, 15)
Out[41]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km
258666 2.916667 2019-04-24 16:56:54.516 2019-04-24 16:59:50.429 420.000000 SF Test Station 0.000000 0.000000 81.000000 Berry St at 4th St 37.775880 -122.393170 1161 Subscriber False 12793.228655

Define:

  • droping rows where distance equals zero.
In [42]:
#code
df=df[df.distance_km!=0]
#test
print(df[df.distance_km==0].shape)
print(df.shape)
df[df.distance_km==0].head()
(0, 15)
(2207595, 15)
Out[42]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km

Define:

Remove outliers.

In [43]:
#code
df=df[df["distance_km"]<10_000]
#test
print(df[df.distance_km<10_000].shape)
print(df.shape)
df[df.distance_km>10_000].head()
(2207546, 15)
(2207546, 15)
Out[43]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km

Define:

adding date columns based on date.

In [44]:
#code
df["day"]=df.start_time.dt.dayofweek
days=["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
for i in range(7):
    df.day.replace(to_replace=i,value=days[i],inplace=True)
#test
df[["start_time","day"]].sample(10) 
Out[44]:
start_time day
1971530 2019-10-02 18:24:25.864 Wednesday
2244038 2019-11-12 12:09:25.756 Tuesday
76263 2019-09-20 14:37:44.797 Friday
2423983 2019-02-14 18:23:31.842 Thursday
1871548 2019-10-15 19:26:36.296 Tuesday
739234 2019-03-28 21:58:56.767 Thursday
1537070 2019-05-03 14:08:11.292 Friday
552147 2019-07-14 16:40:50.113 Sunday
963315 2019-03-01 19:59:15.197 Friday
2363471 2019-02-23 00:07:58.706 Saturday
In [45]:
df.head(3)
Out[45]:
duration_min start_time end_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type bike_share_for_all_trip distance_km day
1 600.316667 2019-09-30 16:16:32.353 2019-10-01 02:16:51.982 294.000000 Pierce Ave at Market St 37.327581 -121.884559 443.000000 3rd St at Keyes St 37.320866 -121.876279 163 Customer False 1.045756 Monday
2 93.583333 2019-09-30 23:12:25.998 2019-10-01 00:46:01.959 370.000000 Jones St at Post St 37.787327 -122.413278 4.000000 Cyril Magnin St at Ellis St 37.785881 -122.408915 10737 Customer False 0.415781 Monday
3 24.700000 2019-09-30 23:57:34.663 2019-10-01 00:22:16.849 109.000000 17th St at Valencia St 37.763316 -122.421904 460.000000 Terry Francois Blvd at Warriors Way 37.769095 -122.386333 10346 Customer False 3.192105 Monday
In [46]:
df.info(verbose=True,null_counts=True)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2207546 entries, 1 to 2506982
Data columns (total 16 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   duration_min             2207546 non-null  float64       
 1   start_time               2207546 non-null  datetime64[ns]
 2   end_time                 2207546 non-null  datetime64[ns]
 3   start_station_id         2207546 non-null  float64       
 4   start_station_name       2207546 non-null  object        
 5   start_station_latitude   2207546 non-null  float64       
 6   start_station_longitude  2207546 non-null  float64       
 7   end_station_id           2207546 non-null  float64       
 8   end_station_name         2207546 non-null  object        
 9   end_station_latitude     2207546 non-null  float64       
 10  end_station_longitude    2207546 non-null  float64       
 11  bike_id                  2207546 non-null  int64         
 12  user_type                2207546 non-null  object        
 13  bike_share_for_all_trip  2207546 non-null  bool          
 14  distance_km              2207546 non-null  float64       
 15  day                      2207546 non-null  object        
dtypes: bool(1), datetime64[ns](2), float64(8), int64(1), object(4)
memory usage: 271.6+ MB
In [47]:
df=df[["duration_min","day","start_time","end_time","distance_km",\
"start_station_id","start_station_name","start_station_latitude","start_station_longitude"\
,"end_station_id","end_station_name","end_station_latitude","end_station_longitude","user_type","bike_share_for_all_trip"]]
df.to_csv("cleaned_combined.csv",index=False)

The Data

Each trip is anonymized and includes:

  • Trip Duration
  • Start Time and Date
  • End Time and Date
  • Start Station ID
  • Start Station Name
  • Start Station Latitude
  • Start Station Longitude
  • End Station ID
  • End Station Name
  • End Station Latitude
  • End Station Longitude
  • Bike ID
  • Bike Share for All Trip

    which tracks members who are enrolled in the Bike Share for All program for low-income residents.

  • User Type

    (Subscriber or Customer – “Subscriber” = Member or “Customer” = Casual)

In [48]:
# df=df[["duration_min","day","start_time","end_time","distance_km",\
# "start_station_id","start_station_name","start_station_latitude","start_station_longitude"\
# ,"end_station_id","end_station_name","end_station_latitude","end_station_longitude","user_type","bike_share_for_all_trip"]]
# df.to_csv("combined_tripdata.csv",index=False)
# df=pd.read_csv("combined_tripdata.csv")
# df.head()

Univariate visualization.

In [49]:
#create figure and axes object and setting number of columns of subplit grid
fig,(ax1,ax2)=plt.subplots(figsize=(12,5),ncols=2)
colors=["#BDD2A6","#C5E6A6"]
fig.suptitle('Portion of bike users in 2019', fontsize=14, fontweight='bold')
user_type=df.user_type.value_counts()
bike_sh=df.bike_share_for_all_trip.value_counts()
# first axes plotting Portion of user_type
_=ax1.pie(user_type,labels=user_type.index,startangle=90,counterclock=False,
       wedgeprops={'width':1},autopct='%1.1f%%',textprops={'fontsize': 14},
         colors=colors)
_=ax1.set_title("Portion of customers or subscriber",size=13,fontweight='bold')
# Equal aspect ratio ensures that pie is drawn as a circle
_=ax1.axis('equal')

# Second  axes plotting Portion of bike_sh
_=ax2.pie(bike_sh,labels=bike_sh.index,startangle=90,counterclock=False,
         wedgeprops={'width':1},autopct='%1.1f%%',textprops={'fontsize': 14},
          colors=colors)
_=ax2.set_title("enrolled in \"Bike Share for All program\""
               ,size=13,fontweight='bold')
# Equal aspect ratio ensures that pie is drawn as a circle
_=ax2.axis('equal')

Observation:

Right pie chart:

This pie chart shows the portion of users enrolled in "Bike Share for All Program" and the majority in the dataset are not. Bike Share for All is available to Bay Area residents ages 18 and older who qualify for Calfresh, SFMTA Lifeline Pass, or PG&E CARE utility discount.

Left pie chart:

This pie chart shows the portion of bike users wether They're a cCustomers or subscribers, the pie chart indicates that the majority are subscribed. The reason for that Probably because the fact that single rides start at \$2 for the first 30 minutes, then \\$2 per additional 15 minutes for the casual users whereas an annual membership has unlimited 45-minute Classic bike rides and discounted ebikes: Free unlocks and $0.15 per minute (waived if no Classic bikes available)

Resource: https://www.lyft.com/bikes/bay-wheels/bike-share-for-all

In [50]:
#create the plot
sb.set(rc={'figure.figsize':(12,5)})
dayofweek=['Monday', 'Tuesday','Wednesday','Thursday'\
 ,'Friday','Saturday', 'Sunday']
color=sb.color_palette()[0]
with plt.style.context('ggplot'):
    sb.countplot(data=df,x="day", color= color,order=dayofweek)
    plt.title("Most bike rental users in the week.")
    
    #add annotations
    n_points=df.shape[0]
    day_counts=df['day'].value_counts()
    locs,labels=plt.xticks()#get the current tick locations and labels
    
    #loop thru each pair of locations nd labels
    for loc,label in zip(locs,labels):

        #get the text property for the label to get the correct count
        count=day_counts[label.get_text()]
        pct_string='{:0.1f}%'.format(100*count/n_points)

        # print the annotation at the top of the bar 
        plt.text(loc,count-8,pct_string,ha='center',color='black',va='bottom',fontsize=12)

Observation:

The majority of bike ride are in the weekdays mostly Thursday 17.3% whereas on Saturday 9% and Sunday 8.1% as we can see a major drop of bike ride users.

In [51]:
np.percentile(df.duration_min,100)
Out[51]:
1436.95
In [52]:
(df.duration_min).describe()
Out[52]:
count   2207546.000000
mean         12.875548
std          29.637227
min           1.016667
25%           5.966667
50%           9.416667
75%          14.483333
max        1436.950000
Name: duration_min, dtype: float64
In [53]:
from matplotlib.ticker import PercentFormatter
weights=np.ones(df.shape[0])/df.shape[0]

_=plt.hist(x=df.duration_min,lw=0,bins=50,range=(0,60),weights=weights)
_=plt.axvline(x=(df.duration_min).mean(),color='red')
plt.gca().yaxis.set_major_formatter(PercentFormatter(.1))
_=plt.title("Frequent duration in minuts for bike users",size=15)
_=plt.ylabel("Frequency",size=15)
_=plt.xlabel("Minutes",size=15)
_=plt.legend(["mean","minutes"])

Observation:

Here is a right skewed distribution of duration in minutes for our data (aka positively skewd distribution). The majority falls between 3 and 7 minuts bike rids.

We have a mean of mean:12.8 and median:9.4 which is lower than the mean.

The reason that most bike users are below 60 is probably because bay-wheels bike share charges for rides longer than 60 minutes for $3 in each additional 15 minutes.

In [54]:
df.columns
Out[54]:
Index(['duration_min', 'day', 'start_time', 'end_time', 'distance_km',
       'start_station_id', 'start_station_name', 'start_station_latitude',
       'start_station_longitude', 'end_station_id', 'end_station_name',
       'end_station_latitude', 'end_station_longitude', 'user_type',
       'bike_share_for_all_trip'],
      dtype='object')
In [55]:
print("mean>median:",np.mean(df.duration_min),">",np.percentile(df.duration_min,50))
mean>median: 12.875547916102462 > 9.416666666666666
In [56]:
df[["distance_km"]].describe()
Out[56]:
distance_km
count 2207546.000000
mean 1.737403
std 1.056652
min 0.013287
25% 0.959250
50% 1.486212
75% 2.260752
max 69.469241
In [57]:
weights=np.ones(df.shape[0])/df.shape[0]

_=plt.hist(df.distance_km,bins=np.arange(0,5,.3),range=None,weights=weights,lw=1)
plt.gca().yaxis.set_major_formatter(PercentFormatter(.2))
_=plt.axvline(x=df.distance_km.mean(),color='red')
_=plt.title("Portion  distance in km for bike users",size=15)
_=plt.ylabel("Frequency",size=15)
_=plt.xlabel("Distance in km",size=15)
_=plt.xticks(np.arange(0,5,.3))
_=plt.legend(["mean","distance"])

Observation:

70% of 2019 bike users have distance range between 0.6 km to 1.5 km. 50% are below 1.5 km and we have an Outliers only minimum 0.004% users above 69 km. For distance we have a right skewed distribution, the mean is typically greater than the median. Also notice that the tail of the distribution on the right hand (positive) side is longer than on the left hand side.

In [58]:
print("mean>median:",np.mean(df.distance_km),">",np.percentile(df.distance_km,50))
mean>median: 1.7374028431730926 > 1.486212093293825
In [59]:
# df.to_csv("cleaned_combined.csv",index=False)

Bivariate visualization.

Exploring distribution of the dataset and variables using heatmap.

we will explore selected variable and try to see the correlation between each one using visualization.

In [60]:
pearsoncorr=df.copy()
pearsoncorr=pearsoncorr.corr(method='pearson')#new data frame that holds the pearson correlation
plt.figure(figsize=(15,8))#create a figure object
_=sb.heatmap(pearsoncorr,cmap='RdBu_r',
           annot=True,annot_kws={'size':12},
           linewidth=0.5,square=True)#visualization

plt.xticks(size=15);plt.yticks(size=15)#setting x & y tick size
_=plt.title("calculating the Pearson coefficient of correlation",size=17)# creating title

Observation:

There is a high correlation between start station latitude and end station longitude.

In [61]:
from matplotlib.colors import LogNorm
fig, ax = plt.subplots()#create figure and axis for the plot
# create 2d hist for start station latitude in the x axis and end station latitude in the y axis
h=ax.hist2d(x=df.start_station_latitude,y=df.end_station_latitude,cmap=plt.cm.RdYlBu_r)
plt.colorbar(mappable=h[3],ax=ax)
plt.title("Correlation between start_station_latitude and end_station_latitude ")
plt.ylabel("end_station_latitude")
plt.xlabel("start_station_latitude")
# h[3]-> get the image from the return value so it knows what colormap and limits for colorbar
Out[61]:
Text(0.5, 0, 'start_station_latitude')
In [62]:
fig,ax=plt.subplots(ncols=2,nrows=2,figsize=(16,12))#create 4 figures 
#rows,columns and inter these are to iterate over each figure 
rows=[0,1,0,1]
columns=[0,0,1,1]
inter=list(zip(rows,columns))
'''the below nested for loop will iterate over each axis of the figure
and draw a random 10k sample of duration and day '''
for j in range(len(inter)):# This will loop 4 times
    sam=df.sample(10_000)#set 10k sample df each loop
    for i in sam.index:#calculate days and duration for each figure
        if (sam['day'][i]=='Monday'):
            ax[inter[j]].scatter(sam['day'][i],sam['duration_min'][i],color='#CB997E',label='Monday')
        elif (sam['day'][i]=='Sunday'):
            ax[inter[j]].scatter(sam['day'][i],sam['duration_min'][i],color='#826754',label='Sunday')
        elif (sam['day'][i]=='Saturday'):
            ax[inter[j]].scatter(sam['day'][i],sam['duration_min'][i],color='#8B0000',label='Saturday')
        elif (sam['day'][i]=='Friday'):
            ax[inter[j]].scatter(sam['day'][i],sam['duration_min'][i],color='#BFEDC1',label='Friday')
        elif (sam['day'][i]=='Thursday'):
            ax[inter[j]].scatter(sam['day'][i],sam['duration_min'][i],color='#D1553B',label='Thursday')
        elif (sam['day'][i]=='Wednesday'):
            ax[inter[j]].scatter(sam['day'][i],sam['duration_min'][i],color='#DDBEA9',label='Wednesday')
        elif (sam['day'][i]=='Tuesday'):
            ax[inter[j]].scatter(sam['day'][i],sam['duration_min'][i],color='#A5A58D',label='Tuesday')

# lines_labels = [ax.get_legend_handles_labels() for ax in fig.axes]
# lines, labels = [sum(lol, []) for lol in zip(*lines_labels)] 
# fig.legend(lines, labels)

fig.suptitle("Drawing 10k samples of days and duration minutes",fontsize=15)#title
fig.text(-.01,.5,'Minutes',fontsize=15)#y label position 
fig.text(.4,.01,'Day of the week',fontsize=15)# xlabel position
plt.show()#this will remove unwanted text 

Each day in every graph has the same color to compare them easily.

In [63]:
plt.figure(figsize=(20,5))
#plotting top 5 start station 
sb.boxplot(
    data=df.loc[df['start_station_name'].isin(df.start_station_name.value_counts().index[:5])],
    x='start_station_name',
    y='distance_km',
    color=color)
# plt.yscale('log')
plt.title('top 5 start station name corresponding to distance in km')
plt.xticks(rotation=90, ha='right')
plt.show()
In [64]:
bin_size=0.25
xbin_edges=np.arange(0.6,df.start_station_id.max()+bin_size,bin_size)
xbin_centers = (xbin_edges + bin_size/2)[:-1]

duration_binned=pd.cut(df.start_station_id,xbin_edges,right=False,include_lowest=True)
distance_mean=df['distance_km'].groupby(duration_binned).mean()
distance_sems=df['distance_km'].groupby(duration_binned).sem()
plt.errorbar(x=xbin_centers,y=distance_mean,yerr=distance_sems)
plt.title("Distance & start station id ")
plt.xlabel("start station id")
plt.ylabel("Distance")
plt.show()

Observation:

In the 2d histogram we visualize the variables that are highly correlated which are start_station latitude and end_station latitude.

The

Multivariate visualization.

In [65]:
# plt.figure(figsize=(4,20))
_=sb.lmplot(x='duration_min',y='distance_km',hue='user_type'
         ,data=df.loc[df['start_station_name'].isin(df.start_station_name.value_counts().index[:3])],fit_reg=False)
_=plt.yticks(np.arange(0,7.5,0.5))
_=plt.title("Duration corespondent to distance ")
In [66]:
df.start_station_name.value_counts().index[:3]
Out[66]:
Index(['Market St at 10th St', 'Berry St at 4th St',
       'San Francisco Caltrain (Townsend St at 4th St)'],
      dtype='object')
In [67]:
#for distance that are below 10km only because they're the majority
graph=sb.FacetGrid(df[df["distance_km"]<10],hue='bike_share_for_all_trip',col='user_type')
graph.map(plt.scatter,'distance_km','duration_min')
graph.add_legend()
plt.show()
In [68]:
from scipy import stats
def quantile_plot(x, **kwargs):
    qntls, xr = stats.probplot(x, fit=False)
    plt.scatter(xr, qntls, **kwargs)

g = sb.FacetGrid(df, col="user_type", height=4)
g.map(quantile_plot, "bike_share_for_all_trip");
plt.xticks([0,1],["True","False"])
plt.ylabel('Frequency')
Out[68]:
Text(308.57374999999996, 0.5, 'Frequency')
In [69]:
df.query("user_type=='Customer' and bike_share_for_all_trip==@True")
Out[69]:
duration_min day start_time end_time distance_km start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude user_type bike_share_for_all_trip
In [70]:
q=df.sample(1_000)
In [71]:
q
Out[71]:
duration_min day start_time end_time distance_km start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude user_type bike_share_for_all_trip
1975951 5.533333 Wednesday 2019-10-02 09:51:47.970 2019-10-02 09:57:20.074 0.863776 30.000000 San Francisco Caltrain (Townsend St at 4th St) 37.776598 -122.395282 36.000000 Folsom St at 3rd St 37.783830 -122.398870 Customer False
91891 14.550000 Wednesday 2019-09-18 18:36:06.854 2019-09-18 18:50:39.987 2.362041 61.000000 Howard St at 8th St 37.776513 -122.411306 127.000000 Valencia St at 21st St 37.756708 -122.421025 Subscriber False
1145421 1.800000 Monday 2019-06-03 19:36:20.177 2019-06-03 19:38:08.353 0.256257 310.000000 San Fernando St at 4th St 37.335885 -121.885660 280.000000 San Fernando St at 7th St 37.337122 -121.883215 Subscriber True
929352 15.566667 Thursday 2019-03-07 09:13:21.340 2019-03-07 09:28:55.496 2.491810 16.000000 Steuart St at Market St 37.794130 -122.394430 364.000000 China Basin St at 3rd St 37.772000 -122.389970 Customer False
1177751 13.116667 Thursday 2019-08-29 16:02:36.586 2019-08-29 16:15:44.492 2.572536 338.000000 13th St at Franklin St 37.803189 -122.270579 212.000000 Mosswood Park 37.824893 -122.260437 Customer False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20010 11.166667 Friday 2019-09-27 17:05:23.566 2019-09-27 17:16:33.925 1.827058 23.000000 The Embarcadero at Steuart St 37.791464 -122.391034 6.000000 The Embarcadero at Sansome St 37.804770 -122.403234 Subscriber False
1132016 25.183333 Wednesday 2019-06-05 16:15:38.093 2019-06-05 16:40:49.149 1.391479 197.000000 El Embarcadero at Grand Ave 37.808848 -122.249680 181.000000 Grand Ave at Webster St 37.811377 -122.265192 Customer False
907233 12.416667 Sunday 2019-03-10 13:21:38.297 2019-03-10 13:34:04.279 1.310345 15.000000 San Francisco Ferry Building (Harry Bridges Pl... 37.795392 -122.394203 6.000000 The Embarcadero at Sansome St 37.804770 -122.403234 Customer False
1189277 15.216667 Wednesday 2019-08-28 08:16:29.398 2019-08-28 08:31:43.371 2.580121 22.000000 Howard St at Beale St 37.789756 -122.394643 102.000000 Irwin St at 8th St 37.766883 -122.399579 Subscriber False
322553 55.100000 Sunday 2019-04-14 14:28:03.423 2019-04-14 15:23:10.090 3.103496 42.000000 San Francisco City Hall (Polk St at Grove St) 37.778650 -122.418230 399.000000 Bay St at Fillmore St 37.802636 -122.436289 Customer False

1000 rows × 15 columns

In [72]:
df['month'] = df['start_time'].dt.month
In [73]:
df['start_time'].dt.month
Out[73]:
1          9
2          9
3          9
4          9
5          9
          ..
2506978    2
2506979    2
2506980    2
2506981    2
2506982    2
Name: start_time, Length: 2207546, dtype: int64
In [74]:
fig = plt.figure(figsize = (12,6))
sb.pointplot(data = df, x = 'month' , y = 'duration_min' , hue = 'user_type', palette = 'Reds')
plt.title('Trip duartion across user type in different months')

plt.ylabel('Mean  Dduration ')
# plt.xticks([1,2,3,4,5,6,7,8,9])
plt.xlabel('Month')
plt.show()
In [75]:
df.to_csv("cleaned_combined.csv",index=False)

Observation:

There is not a single user who's a customer and inrolled in bike for all program at least in 2019.Also, when plotting the mean duration across months we can see that the customers have higher duration.

source: https://seaborn.pydata.org/tutorial/axis_grids.html